home *** CD-ROM | disk | FTP | other *** search
#!/bin/sh
#
# Build a list of probable misspellings from TeX/LaTeX source files.
#
# Usage: this-script file.tex ...
#
# Steps:
#   1. Strip TeX comments and markup from every input file and split the
#      remaining text into lowercase words.
#   2. Drop numbers, then drop every word that is a simple derivative
#      (-ing, -s, -d, -ly, -ion) of another word in the list, so that each
#      stem only has to be checked once.
#   3. Write the surviving words to ./words.all and the ones that are not
#      in the system dictionary to ./words.bad.
#
# Environment:
#   WORDS_DICT  dictionary file, one word per line.  Defaults to
#               /usr/dict/words, falling back to /usr/share/dict/words.
#   TMPDIR      where the private scratch directory is created (default /tmp).
#
# Progress goes partly to stderr and partly to stdout; the per-stage counts
# printed on stdout are the number of words each stage removed.
#
# Plain POSIX sh has no 'local', so function-private variables carry a
# prefix (delta_, ew_, main_) instead.

# delta FILE1 FILE2
# Print (line count of FILE1) - (line count of FILE2).
delta() {
  delta_before=$(wc -l < "$1") || return
  delta_after=$(wc -l < "$2") || return
  printf '%s\n' "$((delta_before - delta_after))"
}

# extract_words FILE...
# Print every distinct word found in the given TeX files, one per line,
# lowercased and sorted.  Each file name is echoed to stderr as progress.
extract_words() {
  # Word separators: blanks plus ASCII/TeX punctuation.  The hyphen must stay
  # last so that tr does not read it as a range operator.
  ew_separators=' \t{}()[]<>|\\,.:;@^%/~?!&$=*_#"`'"'-"

  for ew_file in "$@"; do
    printf '%s ' "$ew_file" >&2
    # Order matters: protect escaped percent signs before stripping
    # comments, and remove commands that take arguments (whose arguments
    # are not prose) before the generic "\command" rule.
    sed \
      -e 's/\\%/ /g' \
      -e 's/%.*//' \
      -e 's/\\begin{[^}]*}//g' \
      -e 's/\\end{[^}]*}//g' \
      -e 's/\\ref{[^}]*}//g' \
      -e 's/\\namedlabel{[^}]*}{[^}]*}/ /g' \
      -e 's/\\label{[^}]*}//g' \
      -e 's/\\index{[^}]*}//g' \
      -e 's/\\[_$#\]/ /g' \
      -e 's/\\[a-z]*/ /g' \
      < "$ew_file"
    # Guard against a file without a trailing newline gluing its last word
    # to the first word of the next file; the empty line is dropped below.
    printf '\n'
  done |
    tr -s "$ew_separators" '[\n*]' |
    tr 'A-Z' 'a-z' |
    grep -v '^$' |
    sort |
    uniq
}

# drop_derived LABEL SED_SCRIPT
# Remove from $words_file every word that SED_SCRIPT can derive from some
# word in the list, and print "...LABEL: <number of words removed>".
# SED_SCRIPT must print only its derived forms (use the 'p' flag and end
# with 'd').  Uses the globals words_file, derived_file and kept_file.
drop_derived() {
  printf '...%s: ' "$1"
  sed "$2" < "$words_file" | sort > "$derived_file"
  comm -23 "$words_file" "$derived_file" > "$kept_file"
  delta "$words_file" "$kept_file"
  mv "$kept_file" "$words_file"
}

# main FILE...
# Run the whole extraction; see the header comment for outputs.
# Returns 0 on success, 1 if a scratch directory or dictionary is
# unavailable, 2 on usage error.
main() {
  if [ "$#" -eq 0 ]; then
    printf 'usage: %s file.tex ...\n' "${0##*/}" >&2
    return 2
  fi

  # Private scratch directory instead of predictable names in /tmp.
  workdir=$(mktemp -d "${TMPDIR:-/tmp}/words.XXXXXX") || {
    printf '%s: cannot create a temporary directory\n' "${0##*/}" >&2
    return 1
  }
  trap 'rm -rf -- "$workdir"' EXIT
  trap 'exit 1' HUP INT TERM
  words_file=$workdir/words
  derived_file=$workdir/derived
  kept_file=$workdir/kept
  main_status=0

  printf 'Extracting all words from ' >&2
  extract_words "$@" > "$words_file"
  echo >&2

  echo "Removing derivative words" >&2

  printf '...numbers: '
  grep -v -e '^[0-9]*$' -e '^0x[0-9a-f]*$' "$words_file" > "$kept_file"
  delta "$words_file" "$kept_file"
  mv "$kept_file" "$words_file"

  # Each script saves the word (h), prints one derived form per rule,
  # restores the word (g) before the next rule, and finally deletes the
  # original so that only derived forms are emitted.
  drop_derived ing 'h
s/$/ing/p
g
s/.$/&&ing/p
g
s/ie$/ying/p
g
s/e$/ing/p
d'

  drop_derived s 'h
s/$/s/p
g
s/$/es/p
g
s/y$/ies/p
g
s/s$/ses/p
d'

  drop_derived d 'h
s/$/d/p
g
s/$/ed/p
g
s/.$/&&ed/p
g
s/y$/ied/p
d'

  # "ble" is anchored to the end of the word like every other rule;
  # unanchored it would also rewrite e.g. "problem" to "problym".
  drop_derived ly 'h
s/y$/ily/p
g
s/$/ly/p
g
s/ble$/bly/p
d'

  drop_derived ion 'h
s/$/ion/p
g
s/te$/tion/p
g
s/de$/sion/p
g
s/te$/tions/p
g
s/de$/sions/p
d'

  mv "$words_file" words.all

  echo "extracting probable misspellings"
  main_dict=${WORDS_DICT:-}
  if [ -z "$main_dict" ]; then
    for main_dict in /usr/dict/words /usr/share/dict/words; do
      [ -r "$main_dict" ] && break
    done
  fi
  if [ -r "$main_dict" ]; then
    # comm needs both inputs sorted the same way, so sort the dictionary
    # here rather than trusting the order it is stored in.
    sort "$main_dict" | comm -23 words.all - > words.bad
  else
    printf '%s: cannot read dictionary %s; words.bad not written\n' \
      "${0##*/}" "$main_dict" >&2
    main_status=1
  fi

  rm -rf -- "$workdir"
  trap - EXIT HUP INT TERM
  return "$main_status"
}

main "$@"